# Your code here feel free to use multiple cells and include markdown, graphs, latex or equations as you see fit.
# Third-party analysis/plotting stack used throughout this notebook.
import pandas as pd
import plotly.offline as py
import plotly.graph_objs as go
import numpy as np
import cufflinks as cf
import plotly.figure_factory as ff
cf.go_offline()  # route cufflinks .iplot() through offline plotly (no account needed)
py.init_notebook_mode(connected=True)  # embed plotly.js so charts render inline
import plotly.tools as tls
import matplotlib.pyplot as plt
import seaborn as sns
All date data in the sample spans two months, mainly January. (The bar graph shows only the Date of First Trip, which had the widest range of dates.)
#examine different Time Delta's and create duration data
# Each spec is (new column, later milestone, earlier milestone); the new
# column holds the Timedelta between the two funnel dates.
_dur_specs = [
    ('dur_signup-bgc', 'bgc_date', 'signup_date'),
    ('dur_bgc-vadd', 'vehicle_added_date', 'bgc_date'),
    ('dur_signup-vadd', 'vehicle_added_date', 'signup_date'),
    ('dur_bgc-trip', 'first_completed_date', 'bgc_date'),
    ('dur_vadd-trip', 'first_completed_date', 'vehicle_added_date'),
    ('dur_signup-trip', 'first_completed_date', 'signup_date'),
]
for new_col, later_col, earlier_col in _dur_specs:
    data[new_col] = data[later_col] - data[earlier_col]
# Summary stats (counts, quartiles, mean, etc.) for every duration column.
data[['dur_signup-bgc','dur_signup-vadd','dur_bgc-vadd','dur_bgc-trip','dur_vadd-trip','dur_signup-trip']].describe(include='all')
This table summarizes counts, quartiles, and the general distribution of our duration data.
# Completed-first-trip drivers broken down by city (displayed as a table).
city_counts = pd.DataFrame(data.loc[data['first_trip_completed']==True]['city_name'].value_counts())
city_counts
# Completed-first-trip drivers broken down by signup OS.
pie = pd.DataFrame(data.loc[data['first_trip_completed']==True]['signup_os'].value_counts())
pie.rename_axis('uv').reset_index()
#pie.iplot(labels = pie.iloc[1], values = pie.signup_os,kind='pie')
# Pie chart of incoming signup-OS channels.
palette = ['#FEBFB3', '#E1396C', '#96D38C', '#D0F9B1']
traces = [go.Pie(labels=pie.index, values=pie.signup_os)]
fig = go.Figure(data=traces, layout=go.Layout(title='Incoming Channels: Sign-Up OS'))
fig['data'].update({'text': pie.index, 'textposition': 'auto', 'marker': {'colors': palette}})
py.iplot(fig)
# Two stacked bar charts: the most common vehicle makes and models among
# drivers who completed a first trip.
fig = tls.make_subplots(rows=2, cols=1, subplot_titles=('Top Makes Uber Drivers Prefer','Top Models Uber Drivers Prefer'))
completed = data.loc[data['first_trip_completed']==True]
makes_fig = completed['vehicle_make'].value_counts()[:6].iplot(asFigure = True,title='Top Car Manufacturers Uber Drivers Add',kind='bar')
fig.append_trace(makes_fig['data'][0], 1, 1)
models_fig = completed['vehicle_model'].value_counts()[:6].iplot(asFigure = True, title='Top Models Uber Drivers Prefer to Add',kind='bar')
fig.append_trace(models_fig['data'][0], 2, 1)
py.iplot(fig)
Data Cleaning
# Count rows whose first trip is recorded *before* an earlier funnel step.
print('Out of place Values:')
for label, dur_col in (('SignUp', 'dur_signup-trip'),
                       ('BGC', 'dur_bgc-trip'),
                       ('vehicle', 'dur_vadd-trip')):
    print(label, len(data[data[dur_col].dt.days < 0]))
Make sure there are no trips recorded until after the background check is completed and a vehicle has been added.
#edit data that claims drivers started first trip before vehicle or background checks were completed. replaced with middle of signup and first trip
# Rows needing a bgc_date: missing entirely (but signup and first trip are
# known), or out of order relative to the first trip / signup.
needs_bgc_fix = (
    (data['signup_date'].notnull() & data['first_completed_date'].notnull() & data['bgc_date'].isnull())
    | (data['dur_bgc-trip'].dt.days < 0)
    | (data['dur_signup-bgc'].dt.days < 0)
)
for idx, rec in data.loc[needs_bgc_fix, :].iterrows():
    start = rec['signup_date']
    end = rec['first_completed_date']
    # Impute with the midpoint between signup and first completed trip.
    data.at[idx, 'bgc_date'] = pd.to_datetime(start + (end - start) / 2)
#loop to fill-in missing or edit the vehicle added date column
# Same repair as for bgc_date: impute missing/out-of-order vehicle_added_date
# with the midpoint between signup and first completed trip.
needs_vadd_fix = (
    (data['signup_date'].notnull() & data['first_completed_date'].notnull() & data['vehicle_added_date'].isnull())
    | (data['dur_vadd-trip'].dt.days < 0)
    | (data['dur_signup-vadd'].dt.days < 0)
)
for idx, rec in data.loc[needs_vadd_fix, :].iterrows():
    start = rec['signup_date']
    end = rec['first_completed_date']
    data.at[idx, 'vehicle_added_date'] = pd.to_datetime(start + (end - start) / 2)
#Note run duration cell again to update duration data
#Run cell directly above to make sure data is consistently in order.
# Among rows with both dates present, check the ordering of background check
# vs. vehicle added.
bandv = data.loc[data['bgc_date'].notnull()&data['vehicle_added_date'].notnull()]
# Bug fix: build the boolean masks from `bandv` itself. Masks built from the
# full `data` frame have a different index than the subset and are rejected
# as "unalignable" boolean indexers by modern pandas.
print('entries with background checks before vehicle added:', len(bandv.loc[bandv['bgc_date'] < bandv['vehicle_added_date']]))
print('entries with background checks after vehicle added:', len(bandv.loc[bandv['bgc_date'] > bandv['vehicle_added_date']]))
# calculating conversion percentages
# Percentage of rows populated at each funnel stage, relative to all signups.
count = data.count()
perc = round(count / data.shape[0] * 100, 1)
# Bug fix: `perc[5]` on a string-labeled Series relies on deprecated
# positional fallback (removed in recent pandas); use .iloc explicitly.
titl = f'Conversion of Signup to First-Completed-Trip<br><b>{perc.iloc[5]}%->{perc.iloc[6]}%->{perc.iloc[10]}%</b><br>(% relative to total signups)'
# Hoist the repeated stage-count computation into one variable.
stage_counts = data[['signup_date','bgc_date','vehicle_added_date','first_completed_date']].count()
fig = stage_counts.iplot(kind='bar', title=titl, asFigure=True)
fig['data'].update({'text': stage_counts, 'textposition': 'auto', 'opacity': 0.8, 'marker': {'color': 'rgb(158,202,225)'}})
py.iplot(fig)
The largest relative drop-off occurs at the vehicle_added step, possibly due to the following:
Notes:
#copy of cell from before; recalculate duration values to cleaned data
# Recompute every duration column now that bgc/vehicle dates were repaired.
for dur_col, end_col, start_col in (
    ('dur_signup-bgc', 'bgc_date', 'signup_date'),
    ('dur_bgc-vadd', 'vehicle_added_date', 'bgc_date'),
    ('dur_signup-vadd', 'vehicle_added_date', 'signup_date'),
    ('dur_bgc-trip', 'first_completed_date', 'bgc_date'),
    ('dur_vadd-trip', 'first_completed_date', 'vehicle_added_date'),
    ('dur_signup-trip', 'first_completed_date', 'signup_date'),
):
    data[dur_col] = data[end_col] - data[start_col]
# Stacked histograms of each duration (in whole days) on one figure.
fig = data['dur_signup-bgc'].dt.days.iplot(asFigure = True, kind='histogram', title = 'Days to Completion: Frequency of Successful Conversion by days<br>(Hover Mouse to compare values)')
fig['layout'].update({'barmode':'stack'})
# Add the remaining durations as extra histogram traces.
extra_traces = [
    go.Histogram(x=data['dur_signup-vadd'].dt.days, name='dur_signup-vehAdd'),
    go.Histogram(x=data['dur_signup-trip'].dt.days, name='dur_signup-trip'),
    go.Histogram(x=data['dur_vadd-trip'].dt.days, name='dur_vehAdd-trip'),
    go.Histogram(x=data['dur_bgc-trip'].dt.days, name='dur_bgc-trip'),
]
for trace in extra_traces:
    fig['data'].append(trace)
py.iplot(fig)
All duration data appears to have a right-skewed distribution. This suggests that the sooner an applicant moves on to the next step of the process, the more likely they are to complete it — and therefore to have a successful first trip.
# Pairwise correlation matrix of the duration features.
data[['dur_signup-bgc','dur_signup-vadd','dur_bgc-vadd','dur_bgc-trip','dur_vadd-trip','dur_signup-trip']].corr()
Logistic regression is probably the best fit here, combined with cross-validation (e.g. K-Fold) to train and evaluate the model more reliably.
# Your code here feel free to use multiple cells and include markdown, graphs, latex or equations as you see fit.
# Bug fixes vs. the original cell:
#   * stray `]]` caused a SyntaxError;
#   * the duration columns are named 'dur_*', not 'duration_*';
#   * LinearRegression was used but never imported (and the goal is
#     classification), so fit LogisticRegression as the markdown states;
#   * sklearn cannot consume raw string categoricals or Timedeltas, so
#     one-hot encode the categorical columns and convert durations to days.
categorical_cols = ['city_name', 'signup_os', 'signup_channel']
X = pd.get_dummies(data[categorical_cols], drop_first=True)
for dur_col in ['dur_signup-bgc', 'dur_bgc-vadd', 'dur_vadd-trip']:
    X[dur_col] = data[dur_col].dt.days
X = X.fillna(0)  # rows that never reached a step have NaN durations
y = data['first_trip_completed']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=5)
from sklearn.linear_model import LogisticRegression
lm = LogisticRegression(max_iter=1000)
lm.fit(X_train, y_train)
# Cast to int before subtracting: numpy forbids boolean subtraction.
mse = np.mean((lm.predict(X_test).astype(int) - y_test.astype(int))**2)
y_train_pred = lm.predict(X_train)
y_pred = lm.predict(X_test)
This model would likely perform well on brand-new data with similar characteristics (mainly the same time span) as the sample it was trained on. However, for newer, more recent samples there will likely be a lot of variance (due to seasonality, changes outside the sample data, etc.), leaving an overfit model that is suboptimal at predicting new data.
With Uber's huge data warehouse, it would be interesting to construct a low bias training model and feed it lots of data (over 100 mil training examples on a neural network for example) and it would most likely result in a much better performing model great at forecasting future data.
# Further code or markdowns
#neural network
# Bug fix: the original chained `.sum()/.reset_index()/.sort_values()` on
# their own lines without continuation, which is a SyntaxError; wrap the
# chain in parentheses.
# NOTE(review): shifting Date back 7 days before the W-MON grouping relabels
# each weekly bucket by the prior Monday -- confirm this offset is intended.
df['Date'] = pd.to_datetime(df['Date']) - pd.to_timedelta(7, unit='d')
df = (
    df.groupby(['Name', pd.Grouper(key='Date', freq='W-MON')])['Quantity']
    .sum()
    .reset_index()
    .sort_values('Date')
)